In this notebook we are going to perform different experiments, at different levels: events, schedules, performances and tickets. Furthermore, this notebook allows us to select:
These values are introduced in the Selection Parameters section of this notebook.
Prerequisite: You need to have run (at least once) the Generate_List_Dataframes notebook, and have all the dataframes (df_places, df_new_events, df_p, dp_schedule_revenue, df_tickets, df_p_properties, etc.)
As follows we have the schema of events data.
The most important remarks are:
-1. One event can have more than 1 schedule
-2. One schedule can last more than a day: start_ts and end_ts could comprise a period of several days
-3. One schedule can have more than 1 performance
-4. All performances of each schedule are always in the same place (schedule--> place_id).
-5. A performance has a timestamp (ts)
-6. A performance can have several tickets
-7. A ticket can have max_price and min_price
This is the list of experiments that we are going to perform.
Important: In order to run this notebook you need to have run the Generating_List_DataFrames notebook first, since this notebook assumes that all the dataframes (df_places, df_new_events, df_p, dp_schedule_revenue, df_tickets, df_p_properties) are already stored in the "dataframe" directory.
LDA Topic Modelling: This could take a long time to run depending on the city selected.
import json
import pandas as pd
import plotly.express as px
import os
import pickle
import plotly.graph_objects as go
import numpy as np
from dateutil.relativedelta import relativedelta
from os import listdir
from os.path import isfile, join
import datetime
from scipy import stats
import datetime
import plotly.io as pio
#pio.renderers.default = "browser"
import plotly.offline as pyo
IMPORTANT You need to indicate the following 4 parameters.
These allows you to select the city, the list of categories to study, and the specific month to perform further analyses.
# Selection parameters: the city under study, the list of categories to
# analyse, and the month (number + display name) used for the per-month
# ("festival") analyses across years.
city="St Andrews"
list_categories=["Music", "Visual art", "Film", "Days out", "Books", "Comedy", "Theatre"]
# month is 1-based (3 == March); month_string must match and is only used in figure titles.
month=3
month_string="March"
IMPORTANT: This notebook assumes that you have already generated all the necessary dataframes and stored them in a directory. If not, run the *Generate_List_Dataframes notebook first.
### Change those if you want to use a different path or folder names.
### For the dataframe_path, as you did for Generate_List_Dataframes notebook.
# Directory where Generate_List_Dataframes stored the pickled dataframes.
dataframe_path="./dataframe"
## This path will be used in section 5.
models_path="./models"
# exist_ok=True avoids the check-then-create race of `if not exists: makedirs`
# and is a no-op when the directory is already there.
os.makedirs(models_path, exist_ok=True)
#This path will be used in section 3 - maps
html_path="./html_figures"
os.makedirs(html_path, exist_ok=True)
def dataframe_groupby_size(df, column_list, rename, level=None, city=None, period="full"):
    """Group *df* by *column_list* and count rows per group.

    Parameters:
        df: input dataframe.
        column_list: grouping columns; behaviour depends on its length.
        rename: name given to the size column.
        level: entity label used in figure titles (e.g. "Events"), single-column case only.
        city: optional city name appended to the title.
        period: anything other than "full" appends a monthly-analysis suffix to the title.

    Returns:
        - one grouping column: (grouped_df sorted desc, scatter_fig, bar_fig)
        - several grouping columns: grouped_df only (no figures, no sorting).
    """
    if len(column_list) == 1:
        column = column_list[0]
        df_v1 = df.groupby([column]).size().reset_index()
        df_v1 = df_v1.rename(columns={0: rename}).sort_values(by=[rename], ascending=False)
        # Build the title once; the original duplicated the figure code in
        # both branches just to vary the " at <city>" suffix.
        title = level + " " + rename + " per " + column
        if city:
            title = title + " at " + city
        if period != "full":
            # month_string is a notebook-level selection parameter.
            title = title + " for the month of " + month_string + " over the years"
        fig_scatter = px.scatter(df_v1, x=column, y=rename, color=rename, size=rename, size_max=50, title=title)
        fig_bar = px.bar(df_v1, x=column, y=rename, color=column, barmode='group', title=title)
        return df_v1, fig_scatter, fig_bar
    df_v1 = df.groupby(column_list).size().reset_index()
    df_v1 = df_v1.rename(columns={0: rename})
    return df_v1
def dataframe_groupby_sum(df, list_column, column2, level, rename=None, city=None, period="full"):
    """Group *df* by *list_column* and sum *column2* per group.

    Return shape depends on the arguments:
      * one grouping column, no rename: (grouped_df, scatter_fig, bar_fig)
      * one grouping column, rename:    (grouped_df, bar_fig) with column2 renamed
      * several grouping columns:       grouped_df only (rename expected)
    """
    # Multi-column grouping: no figures, just the renamed, sorted dataframe.
    if len(list_column) != 1:
        grouped = df.groupby(list_column)[column2].sum().reset_index()
        grouped = grouped.rename(columns={column2: rename}).sort_values(by=[rename], ascending=False).reset_index()
        return grouped

    key = list_column[0]
    grouped = df.groupby([key])[column2].sum().reset_index()

    if rename:
        plot_title = level + " " + rename + " per " + key + " at " + city
        if period != "full":
            plot_title = plot_title + " for the month of " + month_string + " over the years"
        grouped = grouped.rename(columns={column2: rename}).sort_values(by=[rename], ascending=False).reset_index()
        bar = px.bar(grouped, x=key, y=rename, color=key, barmode='group', title=plot_title)
        return grouped, bar

    grouped = grouped.sort_values(by=[column2], ascending=False)
    plot_title = level + " " + column2 + " per " + key
    if period != "full":
        plot_title = plot_title + " for the month of " + month_string + " over the years"
    scatter = px.scatter(grouped, x=key, y=column2, color=column2, size=column2, size_max=50, title=plot_title)
    bar = px.bar(grouped, x=key, y=column2, color=key, barmode='group', title=plot_title)
    return grouped, scatter, bar
def fig_histogram_color(df, x, column, level, city, y=None, period="full"):
    """Histogram of df[x] coloured by df[column].

    When *y* is given the bars sum df[y] per bin (72 bins: one per month over
    the six snapshot years); otherwise plain row counts are shown.
    The x axis is always labelled with monthly ticks.
    """
    # Title differs only by the optional " <y>" fragment; build it once.
    if not y:
        title = "Histogram of " + level.lower() + " per " + column + " at " + city
    else:
        title = "Histogram of " + level.lower() + " " + y + " per " + column + " at " + city
    if period != "full":
        title = title + " for the month of " + month_string + " over the years"
    if not y:
        fig = px.histogram(df, x=x, color=column, title=title)
    else:
        # nbins=12*num_years --> 12*6
        fig = px.histogram(df, x=x, y=y, color=column, title=title, nbins=72)
    fig.update_xaxes(
        dtick="M1",
        ticklabelmode="period")
    return fig
def fig_histogram_subtype(df, x, column, y, category, level, city, period='full'):
    """Histogram of df[y] over df[x], restricted to rows whose df[column] is *category*."""
    subset = df[df[column].isin([category])].sort_values(by=[x], ascending=True).reset_index()
    plot_title = "Histogram of " + category + " " + level.lower() + " " + y + " at " + city
    if period != "full":
        plot_title = plot_title + " for the month of " + month_string + " over the years"
    # nbins=72: one bin per month across the six snapshot years (12*6).
    fig = px.histogram(subset, x=x, y=y, title=plot_title, nbins=72)
    fig.update_xaxes(dtick="M1", ticklabelmode="period")
    return fig
def clean_documents(text):
    """Normalise an event description for topic modelling.

    Removes email addresses, web addresses, single quotes and stopwords, then
    collapses a leading article/conjunction plus its surrounding whitespace
    down to the trailing whitespace.
    """
    text1 = re.sub(r'\S*@\S*\s?', '', text, flags=re.MULTILINE)  # remove email
    text1 = re.sub(r'http\S+', '', text1, flags=re.MULTILINE)  # remove web addresses
    text1 = re.sub(r"\'", "", text1)  # remove single quotes
    text2 = remove_stopwords(text1)
    # BUG FIX: the replacement must be a raw string. '\2' in a normal string
    # is the control character \x02, which was being inserted literally;
    # r'\2' is the backreference to group 2 (the trailing whitespace).
    text2 = re.sub(r'\s+(a|an|and|the|The|A|An|And)(\s+)', r'\2', text2)
    return text2
The original dataset consists of 9 files, which are snapshots of a series of events and places, each covering a period of 6 months:
And we "transformed" them into a series of dataframes using the Generating_List_DataFrames notebook.
Important: An event can be present across more than one snapshot (this means that it is present in more than one file). To deal with this issue, we have "merged" together the information of the repeated events. During the merging phase, we got rid of the repeated information, while preserving the parts that are not repeated (e.g. schedules, tags).
Furthermore, we have also created three new columns at event level:
Those are initialized as follows:
# Load the pickled dataframes produced by the Generate_List_Dataframes notebook.
df_places = pd.read_pickle(dataframe_path+"/df_places")
df_events = pd.read_pickle(dataframe_path+"/df_new_events")
# Peek at the first two places.
df_places[0:2]
| address | postal_code | properties | sort_name | town | website | place_id | modified_ts | created_ts | name | loc | country_code | tags | descriptions | phone_numbers | status | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 York Place | admin@thestand.co.uk | EH1 3EB | {'place.child-restrictions': True, 'place.faci... | Stand | Edinburgh | http://www.thestand.co.uk | 1 | 2021-11-24T12:18:33Z | 2021-11-24T12:18:33Z | The Stand | {'latitude': '55.955806109395006', 'longitude'... | GB | [Bar & pub food, Comedy, Restaurants, Venues] | [{'type': 'description.list.default', 'descrip... | {'info': '0131 558 7272', 'box_office': '0131 ... | live |
| 1 | 10 Orwell Terrace | NaN | EH11 2DY | NaN | St Bride's Centre | Edinburgh | http://stbrides.wordpress.com | 371 | 2019-12-04T13:27:26Z | 2019-12-04T13:27:26Z | St Bride's Centre | {'latitude': '55.94255035', 'longitude': '-3.2... | GB | [Cinemas, Community centre, Public buildings, ... | [{'type': 'description.list.default', 'descrip... | {'info': '0131 346 1405'} | live |
df_events[0:2]
| event_id | modified_ts | created_ts | name | sort_name | status | id | schedules | descriptions | website | ... | category | properties | ranking_level | ranking_in_level | source_file | period_start | period_end | frequency_event | phone_numbers | alternative_names | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 157884 | 2021-01-13T18:46:26Z | 2007-12-06T17:18:12Z | Väsen | Väsen | live | 157884 | [{'start_ts': '2018-04-26T20:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.twoforjoy.co.uk | ... | Music | {'list:website:comments-end-date': '2013-01-31... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN |
| 1 | 194419 | 2022-02-02T01:03:54Z | 2010-01-25T14:51:46Z | Martin Simpson | Martin Simpson | live | 194419 | [{'start_ts': '2018-03-10T19:30:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.martinsimpson.com/ | ... | Music | {'list:website:comments-end-date': '2020-01-28... | 2 | 1 | sample_20171101.json, sample_20180501.json | 2017-11-01 | 2018-11-01 | 2 | NaN | NaN |
2 rows × 21 columns
Important: At event level we don't have information about the places, so we cannot filter events by city, nor by scheduling start/end times. But we do have the information about which snapshot period (period start and period end) each event belongs to, along with the frequency of repetition of each event across periods.
# Number of distinct events per category (each event counted once).
g_category, fig_scatter, fig_bar=dataframe_groupby_size(df_events, ['category'], 'frequency', 'Events')
fig_scatter.show()
fig_bar.show()
We are going to calculate the frequency of each category TAKE INTO ACCOUNT the frequency in which an event is repeated across periods.
# Category totals weighted by frequency_event (the number of snapshot files
# each event appears in).
g_category_v2, fig_scatter, fig_bar=dataframe_groupby_sum(df_events, ['category'], 'frequency_event', 'Events')
fig_scatter.show()
fig_bar.show()
Remember: An event has two columns with information about its snapshots period of times
Therefore, our events can start and ends at different periods of time - some examples are here:
# Multi-column grouping returns only the dataframe (no figures), so we sort
# and plot it here ourselves.
g_category_start =dataframe_groupby_size(df_events, ['category', 'period_start'], 'frequency', 'Events')
g_category_start=g_category_start.sort_values(by=['frequency'], ascending=False)
px.scatter(g_category_start, x="category",y='period_start', color='frequency', size="frequency", size_max=60, title="Events frequency per category starting at the same period of time")
g_category_end =dataframe_groupby_size(df_events, ['category', 'period_end'], 'frequency', 'Events')
g_category_end=g_category_end.sort_values(by=['frequency'], ascending=False)
px.scatter(g_category_end, x="category",y='period_end', color='frequency', size="frequency", size_max=60, title="Events frequency per category ending at the same period of time")
# Gantt view of the snapshot spans covered by each category.
fig = px.timeline(df_events, x_start="period_start", x_end="period_end", y="category", color="category")
fig.update_yaxes(autorange="reversed") # otherwise tasks are listed from the bottom up
fig.update_layout(title="Gantt chart of events grouped by categories in different period of time")
fig.update_xaxes(dtick="M2", ticklabelmode="period")
fig.show()
We have pre-calculated the revenue at the schedule level. Originally this information was not available.
1 Event can have 1 to N Schedules.
1 Schedule is in 1 Place
1 Schedule can have 1 to N Performances
1 Peformance can have 1 to N Tickets
1 Ticket has a max_price, min_price, currency.
# Pre-computed revenue at the schedule level (derived in Generate_List_Dataframes).
df_schedule_revenue = pd.read_pickle(dataframe_path+"/df_schedule_revenue")
## Filter by city schedules
df_schedule_revenue_city=df_schedule_revenue[df_schedule_revenue['town'].isin([city])]
len(df_schedule_revenue)
33287
### df_schedule_revenue_city_festival : Just selecting the schedules starting at a particular month across years.
tz_info = df_schedule_revenue_city.iloc[0].start_ts.tzinfo
# One slice per snapshot year for the selected month. relativedelta handles
# month == 12 correctly (2017-12 + 1 month -> 2018-01), whereas the original
# datetime.datetime(year, month+1, 1) raises ValueError for December.
monthly_frames = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    date2 = date1 + relativedelta(months=1)
    monthly_frames.append(df_schedule_revenue_city[(df_schedule_revenue_city['start_ts'] >= date1)
                                                   & (df_schedule_revenue_city['start_ts'] < date2)])
df_schedule_revenue_city_festival = pd.concat(monthly_frames, axis=0)
len(df_schedule_revenue_city_festival)
265
# Schedule counts per category: full period, then the selected month only.
df_s_category, fig_scatter, fig_bar=dataframe_groupby_size(df_schedule_revenue_city, ['category'], 'frequency', 'Schedules', city)
fig_bar.show()
df_s_category_festival, fig_scatter, fig_bar=dataframe_groupby_size(df_schedule_revenue_city_festival, ['category'], 'frequency', 'Schedules', city, period="Month")
fig_bar.show()
# Monthly histograms of schedule counts, overall and per category.
fig_his=fig_histogram_color(df_schedule_revenue_city, "start_ts", "category", "Schedules", city)
fig_his.show()
df_s_city_category= dataframe_groupby_size(df_schedule_revenue_city, ['category', 'start_ts'] , "frequency")
for cat in list_categories:
    fig=fig_histogram_subtype(df_s_city_category, "start_ts", "category", "frequency", cat, "schedules", city)
    fig.show()
fig_his=fig_histogram_color(df_schedule_revenue_city_festival, "start_ts", "category", "Schedules", city)
fig_his.show()
df_s_city_category_festival= dataframe_groupby_size(df_schedule_revenue_city_festival, ['category', 'start_ts'] , "frequency", period="Month")
for cat in list_categories:
    fig=fig_histogram_subtype(df_s_city_category_festival, "start_ts", "category", "frequency", cat, "schedules", city, period="Month")
    fig.show()
# Revenue (sum of s_revenue) per category: full period, then selected month.
df_s_revenue, fig=dataframe_groupby_sum(df_schedule_revenue_city, ['category'], 's_revenue', 'schedules', 'revenue', city)
fig.show()
df_s_revenue_festival, fig=dataframe_groupby_sum(df_schedule_revenue_city_festival, ['category'], 's_revenue', 'schedules', 'revenue', city, period="Month")
fig.show()
fig_his=fig_histogram_color(df_schedule_revenue_city, "start_ts", "category", "schedules", city, "s_revenue")
fig_his.show()
# Total monthly revenue regardless of category.
fig=px.histogram(df_schedule_revenue_city, x="start_ts", y="s_revenue", histfunc="sum", title="Monthly schedules revenue at " + city)
fig.update_traces(xbins_size="M1")
fig.update_xaxes(
    dtick="M1",
    ticklabelmode="period")
fig.show()
df_s_revenue_category=dataframe_groupby_sum(df_schedule_revenue_city, ['category', 'start_ts'], 's_revenue', 'schedules', 'revenue', city)
for cat in list_categories:
    fig=fig_histogram_subtype(df_s_revenue_category, "start_ts", "category", "revenue", cat, "schedules", city)
    fig.show()
fig_his=fig_histogram_color(df_schedule_revenue_city_festival, "start_ts", "category", "schedules", city, "s_revenue", period="Month")
fig_his.show()
df_s_revenue_category_festival=dataframe_groupby_sum(df_schedule_revenue_city_festival, ['category', 'start_ts'], 's_revenue', 'schedules', 'revenue', city, period="Month")
for cat in list_categories:
    fig=fig_histogram_subtype(df_s_revenue_category_festival, "start_ts", "category", "revenue", cat, "schedules", city, period="Month")
    fig.show()
# Performances, joined with place information so we can filter by town.
df_p = pd.read_pickle(dataframe_path+"/df_p")
df_p_place = df_p.merge(df_places, on='place_id')  # was on=['place_id','place_id'], a redundant duplicate key
df_p_town=df_p_place.dropna(subset=['town'])
# .copy() makes the filtered slice an explicit copy, so the 'ts' assignment
# below no longer triggers pandas' SettingWithCopyWarning.
df_p_city=df_p_town[df_p_town['town'].isin([city])].copy()
df_p_city['ts'] = pd.to_datetime(df_p_city['ts'], utc=True)
len(df_p_city)
/var/folders/4w/57rlv27n3g9b001td63pv1j00000gq/T/ipykernel_68825/277416203.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
4557
###df_p_city_festival : Just selecting the performances starting at a particular month across years.
tz_info = df_p_city.iloc[0].ts.tzinfo
# One slice per snapshot year; relativedelta avoids the month+1 overflow
# that would raise ValueError when month == 12.
monthly_frames = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    date2 = date1 + relativedelta(months=1)
    monthly_frames.append(df_p_city[(df_p_city['ts'] >= date1) & (df_p_city['ts'] < date2)])
df_p_city_festival = pd.concat(monthly_frames, axis=0)
len(df_p_city_festival)
797
# Performance counts per category: full period, then the selected month only.
df_p_category, fig_scatter, fig_bar=dataframe_groupby_size(df_p_city, ['category'], 'frequency', 'Performances', city)
fig_bar.show()
df_p_category, fig_scatter, fig_bar=dataframe_groupby_size(df_p_city_festival, ['category'], 'frequency', 'Performances', city, period="Month")
fig_bar.show()
# Monthly histograms of performance counts, overall and per category.
fig_his=fig_histogram_color(df_p_city, "ts", "category", "Performances", city)
fig_his.show()
df_p_city_category= dataframe_groupby_size(df_p_city, ['category', 'ts'] , "frequency")
for cat in list_categories:
    fig=fig_histogram_subtype(df_p_city_category, "ts", "category", "frequency", cat, "Performances", city)
    fig.show()
fig_his=fig_histogram_color(df_p_city_festival, "ts", "category", "Performances", city, period="Month")
fig_his.show()
df_p_city_category_festival= dataframe_groupby_size(df_p_city_festival, ['category', 'ts'] , "frequency", period="Month")
for cat in list_categories:
    fig=fig_histogram_subtype(df_p_city_category_festival, "ts", "category", "frequency", cat, "Performances", city, period="Month")
    fig.show()
Important: The information of cancellations and sold-out are inside the performances properties. Therefore, here we need to work with the df_p_properties dataframe instead of df_p, because df_p does not have performances properties information.
# Performance properties (cancellations, sold-out flags) joined with places.
df_p_properties = pd.read_pickle(dataframe_path+"/df_p_properties")
df_p_properties_place = df_p_properties.merge(df_places, on='place_id')
df_p_properties_town=df_p_properties_place.dropna(subset=['town'])
# .copy() silences the SettingWithCopyWarning raised by the 'ts' assignment below.
df_p_properties_city=df_p_properties_town[df_p_properties_town['town'].isin([city])].copy()
df_p_properties_city['ts'] = pd.to_datetime(df_p_properties_city['ts'], utc=True)
len(df_p_properties_city)
/var/folders/4w/57rlv27n3g9b001td63pv1j00000gq/T/ipykernel_68825/2922426291.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
1115
###df_p_properties_festival : Just selecting the performances properties starting at a particular month across years.
tz_info = df_p_properties_city.iloc[0].ts.tzinfo
# relativedelta avoids the month+1 overflow when month == 12.
monthly_frames = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    date2 = date1 + relativedelta(months=1)
    monthly_frames.append(df_p_properties_city[(df_p_properties_city['ts'] >= date1)
                                               & (df_p_properties_city['ts'] < date2)])
df_p_properties_city_festival = pd.concat(monthly_frames, axis=0)
len(df_p_properties_city_festival)
534
# Sold-out performances over time (rows carrying the performance.sold-out property).
df_p_city_sold_out=df_p_properties_city.dropna(subset=['performance.sold-out'])
fig_his=fig_histogram_color(df_p_city_sold_out, "ts", "performance.sold-out", "Performances", city)
fig_his.show()
df_p_city_sold_out_fesitval=df_p_properties_city_festival.dropna(subset=['performance.sold-out'])
fig_his=fig_histogram_color(df_p_city_sold_out_fesitval, "ts", "performance.sold-out", "Performances", city, period="Month")
fig_his.show()
# Cancelled performances over time (rows carrying the performance.cancelled property).
df_p_city_cancelled=df_p_properties_city.dropna(subset=['performance.cancelled'])
fig_his=fig_histogram_color(df_p_city_cancelled, "ts", "performance.cancelled", "Performances", city)
fig_his.show()
df_p_city_cancelled_festival=df_p_properties_city_festival.dropna(subset=['performance.cancelled'])
fig_his=fig_histogram_color(df_p_city_cancelled_festival, "ts", "performance.cancelled", "Performances", city, period="Month")
fig_his.show()
We are going to plot in maps the performances places over the years. Three types per visualizations:
from IPython.display import IFrame
# Short city slug used in saved-figure filenames.
if city=="St Andrews":
    city_string="sta"
else:
    city_string=city.lower()
# Context manager closes the token file deterministically; the original
# open(...).read() leaked the file handle.
with open("mapbox_token") as token_file:
    px.set_mapbox_access_token(token_file.read())
# Expand the 'loc' dict column into latitude/longitude columns.
df_p_city2=df_p_city.dropna(subset=['loc'])
df_loc_city=pd.concat([df_p_city2.drop(['loc'], axis=1), df_p_city2['loc'].apply(pd.Series)], axis=1)
# Count performances per (event, schedule, place, timestamp) combination.
df_perf_loc_city= df_loc_city.groupby(['event_id', 'event_name', 'category', 'start_ts', 'end_ts', 'ts', 'town', 'place_id', 'name', 'latitude', 'longitude']).size().reset_index()
df_perf_loc_city=df_perf_loc_city.rename(columns={0: "number_of_perf"}).sort_values(by=['number_of_perf'], ascending=False)
df_perf_loc_city[['start_ts','end_ts', 'ts']] = df_perf_loc_city[['start_ts','end_ts', 'ts']].apply(pd.to_datetime)
# Coordinates arrive as strings; convert and round so identical places compare equal.
df_perf_loc_city['latitude'] = df_perf_loc_city['latitude'].astype(float)
df_perf_loc_city['longitude'] = df_perf_loc_city['longitude'].astype(float)
df_perf_loc_city=df_perf_loc_city.round(8)
df_perf_loc_city['ts'] = pd.to_datetime(df_perf_loc_city['ts'], utc=True)
df_perf_loc_city=df_perf_loc_city.sort_values(by=['ts'], ascending=True)
##df_perf_loc_city["year"]=pd.DatetimeIndex(pd.to_datetime(df_perf_loc_city['ts'], utc=True)).year
# Month-level animation frames for the map.
df_perf_loc_city["month"]=pd.to_datetime(df_perf_loc_city['ts'], utc=True).dt.strftime('%m-%Y')
fig=px.scatter_mapbox(df_perf_loc_city, lat="latitude", lon="longitude", animation_frame="month",
                      title="Performances Places in "+ city +" from 2017 to 2022",
                      color_continuous_scale=px.colors.cyclical.IceFire, size_max=20, zoom=10, hover_name="name")
## un-comment this line if you want to show the map
#fig.show()
## comment these lines if you dont want to save the map
fig_name='performance_places_'+city_string+'.html'
fig.write_html(html_path+'/'+fig_name)
### Trick: Just for displaying the map in the final HTML - you wont need to do this .
IFrame(src="https://storage.googleapis.com/case_study_list/performance_places_sta2.html", width=1000, height=600)
# Frequency of performances per (lat, lon, month) cell for the animated map.
df_perf_loc_city_count=df_perf_loc_city.groupby(['latitude','longitude', 'month']).count().reset_index()
df_perf_loc_city_count=df_perf_loc_city_count.rename(columns={"event_id": "frequency"})
df_perf_loc_city_count=df_perf_loc_city_count.round(8)
df_perf_loc_city_count["name"]=None
df_perf_loc_city_count["place_id"]=None
# groupby().count() loses the place name/id, so look them back up per row
# from the first matching (lat, lon) record.
for index, row in df_perf_loc_city_count.iterrows():
    lat=row['latitude']
    long=row['longitude']
    place_id=df_perf_loc_city[(df_perf_loc_city.latitude == lat) & (df_perf_loc_city.longitude == long)]['place_id'].head(1).iloc[0]
    place_name=df_perf_loc_city[(df_perf_loc_city.latitude == lat) & (df_perf_loc_city.longitude == long)]['name'].head(1).iloc[0]
    df_perf_loc_city_count.at[index, 'name']=place_name
    df_perf_loc_city_count.at[index, 'place_id']=place_id
# Round-trip through datetime so frames are sorted chronologically, then
# reformat as the animation label.
df_perf_loc_city_count['month'] = pd.to_datetime(df_perf_loc_city_count['month'], utc=True)
df_perf_loc_city_count=df_perf_loc_city_count.sort_values(by=['month'], ascending=True)
df_perf_loc_city_count["month"]=pd.to_datetime(df_perf_loc_city_count['month'], utc=True).dt.strftime('%m-%Y')
max_freq=df_perf_loc_city_count["frequency"].max()
fig=px.scatter_mapbox(df_perf_loc_city_count, lat="latitude", lon="longitude", animation_frame="month",
                      color="frequency", size="frequency",
                      range_color=(0,max_freq),
                      title="Performances Places Frequency in "+ city + " from 2017 to 2022",
                      color_continuous_scale=px.colors.cyclical.IceFire, size_max=30, zoom=10, hover_name="name")
## un-comment this line if you want to show the map
#fig.show()
## comment these lines if you dont want to save the map
fig_name='freq_performance_places_'+city_string+'.html'
fig.write_html(html_path+'/'+fig_name)
### Trick: Just for displaying the map in the final HTML - you wont need to do this .
IFrame(src="https://storage.googleapis.com/case_study_list/"+fig_name, width=1000, height=600)
(Just During the Selected Month Over the Years)
# Restrict the performance-place table to the selected month across years.
tz_info = df_perf_loc_city.iloc[0].ts.tzinfo
# Convert 'ts' once; the original recomputed pd.to_datetime for every comparison.
ts_utc = pd.to_datetime(df_perf_loc_city['ts'], utc=True)
# NOTE(review): the original filter used `<= date2`, which also kept rows
# falling on the first instant of the following month; `< date2` matches the
# month filters used everywhere else in this notebook.
monthly_frames = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    date2 = date1 + relativedelta(months=1)  # safe when month == 12
    monthly_frames.append(df_perf_loc_city[(ts_utc >= date1) & (ts_utc < date2)])
city_festival = pd.concat(monthly_frames, axis=0)
city_festival['ts'] = pd.to_datetime(city_festival['ts'], utc=True)
city_festival=city_festival.sort_values(by=['ts'], ascending=True)
# Day-level animation frames for the monthly map.
city_festival["day"]=pd.to_datetime(city_festival['ts'], utc=True).dt.strftime('%Y-%m-%d')
fig=px.scatter_mapbox(city_festival, lat="latitude", lon="longitude", animation_frame="day",
                      animation_group="name",
                      title="Performances Places in "+ city + " During the Month of " + month_string + " (2018 to 2021)",
                      color_continuous_scale=px.colors.cyclical.IceFire, size_max=20, zoom=10, hover_name="name")
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 3000
## un-comment this line if you want to show the map
#fig.show()
## comment these lines if you dont want to save the map
fig_name='performance_places_'+city_string+'_'+month_string+'.html'
fig.write_html(html_path+'/'+fig_name)
### Trick: Just for displaying the map in the final HTML - you wont need to do this .
IFrame(src="https://storage.googleapis.com/case_study_list/"+fig_name, width=1000, height=600)
# Frequency of performances per (lat, lon, day) cell for the animated monthly map.
city_festival_count=city_festival.groupby(['latitude','longitude', 'day']).count().reset_index()
city_festival_count=city_festival_count.rename(columns={"event_id": "frequency"})
city_festival_count=city_festival_count.round(8)
city_festival_count["name"]=None
city_festival_count["place_id"]=None
# Re-attach place name/id lost by groupby().count(); compute the boolean mask
# once per row instead of filtering the dataframe twice.
for index, row in city_festival_count.iterrows():
    same_place = (city_festival.latitude == row['latitude']) & (city_festival.longitude == row['longitude'])
    first_match = city_festival[same_place].head(1)
    city_festival_count.at[index, 'name'] = first_match['name'].iloc[0]
    city_festival_count.at[index, 'place_id'] = first_match['place_id'].iloc[0]
# Sort frames chronologically, then reformat as the animation label.
city_festival_count['day'] = pd.to_datetime(city_festival_count['day'])
city_festival_count=city_festival_count.sort_values(by=['day'], ascending=True)
city_festival_count['day']=city_festival_count['day'].dt.strftime('%Y-%m-%d')
max_frequency_place_fest=city_festival_count["frequency"].max()
fig=px.scatter_mapbox(city_festival_count, lat="latitude", lon="longitude", animation_frame="day",
                      #animation_group="category",
                      color= "frequency",
                      range_color=(0,max_frequency_place_fest),
                      size="frequency",
                      mapbox_style="carto-positron",
                      zoom=10,
                      title="Performances Places Frequency in "+ city + " During the Month of " + month_string + " (2018 to 2021)",
                      size_max=50, hover_name="name")
#fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 50
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 3000
## un-comment this line if you want to show the map
#fig.show()
## comment these lines if you dont want to save the map
fig_name='freq_performance_places_'+city_string+'_'+month_string+'.html'
fig.write_html(html_path+'/'+fig_name)
### Trick: Just for displaying the map in the final HTML - you wont need to do this .
IFrame(src="https://storage.googleapis.com/case_study_list/"+fig_name, width=1000, height=600)
# Tickets, joined with place information so we can filter by town.
df_tickets = pd.read_pickle(dataframe_path+"/df_tickets")
df_t_place = df_tickets.merge(df_places, on='place_id')
df_t_town=df_t_place.dropna(subset=['town'])
# .copy() silences the SettingWithCopyWarning raised by the 'ts' assignment below.
df_t_city=df_t_town[df_t_town['town'].isin([city])].copy()
df_t_city['ts'] = pd.to_datetime(df_t_city['ts'], utc=True)
len(df_t_city)
/var/folders/4w/57rlv27n3g9b001td63pv1j00000gq/T/ipykernel_68825/3613812720.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
5542
### df_t_city_festival : Just selecting the tickets starting at a particular month across years.
tz_info = df_t_city.iloc[0].ts.tzinfo
# relativedelta avoids the month+1 overflow when month == 12.
monthly_frames = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    date2 = date1 + relativedelta(months=1)
    monthly_frames.append(df_t_city[(df_t_city['ts'] >= date1) & (df_t_city['ts'] < date2)])
df_t_city_festival = pd.concat(monthly_frames, axis=0)
len(df_t_city_festival)
953
# Ticket counts per category: full period, then the selected month only.
df_t_category, fig_scatter, fig_bar=dataframe_groupby_size(df_t_city, ['category'], 'frequency', 'Tickets', city)
fig_bar.show()
df_t_category_festival, fig_scatter, fig_bar=dataframe_groupby_size(df_t_city_festival, ['category'], 'frequency', 'Tickets', city, period="Month")
fig_bar.show()
fig_his=fig_histogram_color(df_t_city, "ts", "category", "Tickets", city)
fig_his.show()
# Tickets by ticket type (rows with a 'type' value only).
df_t_type_city= df_t_city.dropna(subset=['type'])
fig_his=fig_histogram_color(df_t_type_city, "ts", "type", "Tickets", city)
fig_his.show()
fig_his=fig_histogram_color(df_t_city_festival, "ts", "category", "Tickets", city, period="Month")
fig_his.show()
df_t_type_city_festival= df_t_city_festival.dropna(subset=['type'])
fig_his=fig_histogram_color(df_t_type_city_festival, "ts", "type", "Tickets", city, period="Month")
fig_his.show()
## only max_price==0.0
# Free tickets (zero revenue) over time.
df_f_price_city= df_t_city[df_t_city["t_revenue"]== 0.0]
fig_his=fig_histogram_color(df_f_price_city, "ts", "category", "Tickets", city)
fig_his.show()
## only max_price==0.0
df_f_price_city_festival= df_t_city_festival[df_t_city_festival["t_revenue"]== 0.0]
fig_his=fig_histogram_color(df_f_price_city_festival, "ts", "category", "Tickets", city, period="Month")
fig_his.show()
df_t_city_category= dataframe_groupby_size(df_t_city, ['category', 'ts'] , "frequency")
for cat in list_categories:
    fig=fig_histogram_subtype(df_t_city_category, "ts", "category", "frequency", cat, "Tickets", city)
    fig.show()
# NOTE(review): this festival call omits period="Month"; harmless here because
# the multi-column path of dataframe_groupby_size ignores period — confirm intent.
df_t_city_category_festival= dataframe_groupby_size(df_t_city_festival, ['category', 'ts'] , "frequency")
for cat in list_categories:
    fig=fig_histogram_subtype(df_t_city_category_festival, "ts", "category", "frequency", cat, "Tickets", city, period="Month")
    fig.show()
IMPORTANT: This section is very computing-intensive. It could take a long time to run if you have selected a city with a high number of events (e.g. Edinburgh).
We are going to calculate the topic modelling of the descriptions events placed for the selected city.
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords
import re
from hdbscan import HDBSCAN
# Load the per-town event descriptions and keep only the selected city,
# discarding events that have no description at all.
df_desc_town = pd.read_pickle(dataframe_path + "/df_desc_town")
df_desc_city = df_desc_town[df_desc_town['town'].isin([city])]
df_desc_city = df_desc_city.dropna(subset=['event_description']).reset_index()
documents = df_desc_city["event_description"].values
# Pre-clean each description with the notebook's clean_documents helper.
d = [clean_documents(text) for text in documents]
len(d)
1998
# Un-comment / re-run this cell to regenerate and save the text embeddings.
# Encoder: the all-MiniLM-L6-v2 sentence transformer.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every cleaned description into a dense vector.
text_embeddings = model.encode(d, batch_size = 8, show_progress_bar = True)
city_embeddings = np.array(text_embeddings)
# St Andrews historically uses the short "sta" file prefix.
embeddings_name = models_path + (
    "/sta_embeddings.npy" if city == "St Andrews" else "/" + city + "_embeddings.npy"
)
np.save(embeddings_name, city_embeddings)
# Comment these lines out if you do not want to read the text embeddings from file.
# The file name mirrors the one used by the saving cell above ("sta" for St Andrews).
embeddings_name = models_path + (
    "/sta_embeddings.npy" if city == "St Andrews" else "/" + city + "_embeddings.npy"
)
text_embeddings = np.load(embeddings_name)
We have tested two possible configurations for training the topic model:
import os
# Silence the huggingface/tokenizers fork-related parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Un-comment / re-run this cell to retrain the topic model.
# Two configurations were tested:
#   Option 1 - plain BERTopic (no custom clusterer):
#     topic_model = BERTopic(verbose=True, nr_topics="auto", top_n_words=8).fit(d, text_embeddings)
#     saved as models_path + "/sta_BertTopic_Model" (or "/" + city + "_BertTopic_Model")
#   Option 2 - BERTopic on top of an explicit HDBSCAN clusterer (used below).
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
topic_model = BERTopic(hdbscan_model=hdbscan_model, nr_topics="auto").fit(d, text_embeddings)
# Persist the trained model ("sta" short name for St Andrews).
bertopic_name = models_path + (
    "/sta_BertTopic_Model_HDBSCAN" if city == "St Andrews" else "/" + city + "_BertTopic_Model_HDBSCAN"
)
topic_model.save(bertopic_name)
# Comment these lines out if you do not want to read the topic model from file.
### Attention - you have two options again, with and without HDBSCAN.
##### Option 1: Without HDBSCAN
if city == "St Andrews":
    bertopic_name = models_path + "/sta_BertTopic_Model"
else:
    # BUG FIX: this was models_path+"/"+city+"sta_BertTopic_Model", producing
    # e.g. ".../Edinburghsta_BertTopic_Model" — a path that never matches the
    # "<city>_BertTopic_Model" file name used when the model is saved.
    bertopic_name = models_path + "/" + city + "_BertTopic_Model"
topic_model = BERTopic.load(bertopic_name)
#### Option 2: Using HDBSCAN
#if city == "St Andrews":
#    bertopic_name = models_path + "/sta_BertTopic_Model_HDBSCAN"
#else:
#    bertopic_name = models_path + "/" + city + "_BertTopic_Model_HDBSCAN"
#topic_model = BERTopic.load(bertopic_name)
# Sanity check: number of cleaned event descriptions fed to the model.
len(d)
1998
# Frequency of the ten largest topics (in BERTopic, topic -1 is the outlier bucket).
topic_model.get_topic_freq()[0:10]
| Topic | Count | |
|---|---|---|
| 0 | -1 | 610 |
| 1 | 0 | 251 |
| 2 | 1 | 205 |
| 3 | 2 | 57 |
| 4 | 3 | 56 |
| 5 | 4 | 54 |
| 6 | 5 | 49 |
| 7 | 6 | 45 |
| 8 | 7 | 44 |
| 9 | 8 | 42 |
# Fit/transform on the documents and embeddings to obtain the per-document
# topic assignments and their probabilities.
topics, probs = topic_model.fit_transform(d, text_embeddings)
# Interactive overviews of the fitted topics.
topic_model.visualize_barchart()
topic_model.visualize_heatmap()
topic_model.visualize_topics()
#topic_model.get_topics()
# Top words (with weights) for topic 0.
topic_model.get_topic(0)
[('dance', 0.027559220391353875),
('st', 0.02354862222896757),
('andrews', 0.020818752173804935),
('performance', 0.01952703114565396),
('music', 0.019420941901859903),
('comedy', 0.01870962167013262),
('any', 0.018035794281127976),
('tickets', 0.018006437504891542),
('venue', 0.01730814323633089),
('theatre', 0.01585434229916706)]
# Attach the topic assignments plus each event's category and a readable
# topic name to the city description dataframe.
# NOTE: df_topic_city aliases df_desc_city (no copy), as in the original cell.
df_topic_city=df_desc_city
df_topic_city['topic_lda'] = topics

# Category lookup: map event_id -> category once, instead of scanning
# df_events for every row inside iterrows (the old per-row filter was
# O(rows * events)). drop_duplicates keeps the first match, mirroring
# the old `.values[0]`.
event_to_category = (
    df_events.drop_duplicates(subset='event_id').set_index('event_id')['category']
)
df_topic_city['category'] = df_topic_city['event_id'].map(event_to_category)

# The topic -> word-list lookup is identical for every row with the same
# topic id, so compute it once per distinct topic.
topic_words = {}
topic_names = {}
for topic_num in df_topic_city['topic_lda'].unique():
    words = [word for word, _weight in topic_model.get_topic(topic_num)]
    topic_words[topic_num] = words
    # Human-readable label: "<topic_id>_<top 5 words joined by '_'>".
    topic_names[topic_num] = str(topic_num) + "_" + '_'.join(words[0:5])
# NOTE: rows sharing a topic share the same word-list object (read-only use assumed).
df_topic_city['lda_topics'] = df_topic_city['topic_lda'].map(topic_words)
df_topic_city['topic_lda_name'] = df_topic_city['topic_lda'].map(topic_names)
# Assign a stable integer id to every category, in first-appearance order.
# (Dict comprehension over enumerate replaces the manual counter, which
# shadowed the builtin `id`.)
category_list=df_topic_city["category"].unique()
category_id = {cat: idx for idx, cat in enumerate(category_list)}
category_id
# Vectorized replacement for the per-row iterrows loop.
# NOTE: .map yields an integer column when every category is known, whereas
# the old `.at` loop produced float64 (column started as all-NaN); downstream
# use as a categorical id is unaffected.
df_topic_city["category_id"] = df_topic_city["category"].map(category_id)
# Events per (event_id, category), then the total number of rows per category.
df_topic_city_category= df_topic_city.groupby(['event_id', 'category']).size().reset_index()
df_topic_city_category.rename(columns={0: "number_of_times"}, inplace=True)
df_topic_city_category_1= df_topic_city_category.groupby(['category'])['number_of_times'].sum().reset_index()
df_topic_city_category_1 = (
    df_topic_city_category_1
    .rename(columns={'number_of_times': "sum_num_of_times"})
    .sort_values(by=['sum_num_of_times'], ascending=False)
    .reset_index()
)
# Cross-tabulate topic x category frequencies for the bubble charts below.
df_topic_city_category_lda= df_topic_city.groupby(['topic_lda', 'category']).size().reset_index()
df_topic_city_category_lda.rename(columns={0: "number_of_times"}, inplace=True)
# Bubble charts of topic/category co-occurrence counts.
# (Title typo fixed: "Frequency of events of per category" -> "... per category".)
px.scatter(df_topic_city_category_lda, x="category", y='topic_lda', color='topic_lda', size="number_of_times", size_max=30, title="Frequency of events per category")
px.scatter(df_topic_city_category_lda, x="topic_lda", y='category', color='category', size="number_of_times", size_max=30, title="Frequency of events per category")